import pandas as pd
import pyarrow.parquet as pq
import plotly
import numpy as np
import scipy
from scipy.fftpack import fft
from scipy.signal import butter, lfilter, freqz
import os

# Plotly cloud credentials must not live in source control: they are read from
# the PLOTLY_USERNAME / PLOTLY_API_KEY environment variables and registered only
# when both are provided. The figures below are rendered with plotly.offline.
# NOTE(review): the API key that used to be hard-coded on this line has been
# exposed in the file history and should be regenerated.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username,
                                      api_key=_plotly_api_key)
import plotly.graph_objs as go
import plotly.plotly as py
# Enable inline rendering of offline figures inside the notebook.
plotly.offline.init_notebook_mode(connected=True)
# Load the metadata table: one row per signal, holding the measurement IDs and
# the target label.
train_meta = pd.read_csv('metadata_train.csv')
# Report the table size and how many rows carry target == 1.
print('There are {} entries, {} of which are labeled as damaged line'.format(
    len(train_meta), (train_meta['target'] == 1).sum()))
# Columns of the metadata table:
# id_measurement: the ID code for a trio of signals recorded at the same time.
# signal_id: the foreign key for the signal data. Each signal ID is unique across both train and test, so the first ID in train is '0' but the first ID in test is '8712'.
# phase: the phase ID code within the signal trio. The phases may or may not all be impacted by a fault on the line.
# target: 0 if the power line is undamaged, 1 if there is a fault.
train_meta.head()
# Keep only the rows labelled as damaged (target == 1).
damaged_data = train_meta[train_meta['target'] == 1]
# A measurement is a trio of phases, and a fault may be marked on 1, 2 or all 3
# of them. Count the damaged rows per measurement to tell these cases apart.
temp_ = (
    damaged_data
    .groupby(['id_measurement'])
    .agg({'target': 'count'})
    .reset_index()
)
id_3phases = temp_.loc[temp_['target'] == 3, 'id_measurement'].values
# Membership mask computed once and reused for both subsets.
_in_3phases = damaged_data['id_measurement'].isin(id_3phases)
# Measurements with all three phases marked as damaged ...
damaged_data_3phases = damaged_data[_in_3phases]
# ... and those with only one or two phases marked as damaged.
damaged_data_12phases = damaged_data[~_in_3phases]
# Load from train.parquet only the columns that belong to damaged signals;
# the parquet columns are named by signal_id, as strings.
train_damaged = pq.read_pandas(
    'train.parquet',
    columns=list(map(str, damaged_data['signal_id'])),
).to_pandas()
# There are far more undamaged signals (~8200 ids of 800,000 samples each), so
# they are not loaded up front. This frame starts empty and add_undamaged()
# fills it with the columns that are actually requested.
train_undamaged = pd.DataFrame()
# function to read undamaged lines. Since, each line has 800,000 measurements,
# reading all undamaged lines takes a lot of memory
def add_undamaged(signal_id):
    """Load undamaged signals from train.parquet into ``train_undamaged``.

    Only columns that are not already present in the module-level
    ``train_undamaged`` frame are read, and nothing is read at all when there
    is nothing new to load (an empty request or everything already cached).

    Args:
        signal_id: iterable of signal ids (ints or strings) to make available.

    Returns:
        The module-level ``train_undamaged`` DataFrame, updated in place with
        one column per newly loaded signal id.
    """
    requested = [str(i) for i in signal_id]
    # dict.fromkeys drops duplicate ids while keeping the requested order, so
    # the parquet reader is never asked for the same column twice.
    missing = [col for col in dict.fromkeys(requested)
               if col not in train_undamaged.columns]
    if not missing:
        return train_undamaged
    loaded = pq.read_pandas('train.parquet', columns=missing).to_pandas()
    for col in loaded.columns:
        train_undamaged[col] = loaded[col]
    return train_undamaged
# function for plotting time signals
def get_plot_data(idxs):
    """Build one plotly line trace per requested signal.

    Signals found in ``train_damaged`` are plotted from there; every other
    signal is taken from ``train_undamaged``, after loading the ones that are
    not yet in memory through ``add_undamaged``.

    Args:
        idxs: iterable of signal ids (strings, or values convertible with
            ``str``).

    Returns:
        A list of ``go.Scatter`` traces in the same order as ``idxs``,
        whether or not a signal had to be loaded first.
    """
    idxs = [str(idx) for idx in idxs]
    # Collect the ids held by neither frame (de-duplicated, order preserved)
    # and load them together; skip the file access when nothing is missing.
    missing = [idx for idx in dict.fromkeys(idxs)
               if idx not in train_damaged.columns
               and idx not in train_undamaged.columns]
    if missing:
        add_undamaged(missing)
    data = []  # traces handed to plotly
    for idx in idxs:
        # Legend prefixes are kept exactly as before so existing figures match.
        if idx in train_damaged.columns:
            frame, prefix = train_damaged, "damged_"
        else:
            frame, prefix = train_undamaged, "undamged_"
        data.append(go.Scatter(y=frame[idx], mode="lines", name=prefix + idx))
    return data
# Two signals with similar patterns: one is labelled damaged, the other is not.
data = get_plot_data(['228', '6'])
fig = {'data': data}
plotly.offline.iplot(fig)
# Three signals from the same measurement, one per phase; only one phase is
# labelled as damaged.
data = get_plot_data(['288', '289', '290'])
fig = {'data': data}
plotly.offline.iplot(fig)